/********************************Validating the equivalence of US-EP twins********************************
Main purpose: Validate the equivalence of US-EP twins
dataset: 
valid_twin: pairwise IPC7-based cosine similarity and text-based similarity for US-EP twins in our sample or fake twins (keeping the US patents fixed and match EP equivalences to a random EP patents with the same application quarter, 4-digit IPC code, and grant status.
********************************Validating the equivalence of US-EP twins********************************/


log using "$DATA\twin_validation_log.txt", replace
**********************Table D1 Panel B: Equivalence of twins**********************
use "$DATA\valid_twin", clear
tabstat ipc7_twin ipc7_match , stat(mean median) column(stat)
ttest ipc7_match= ipc7_twin  //test mean different for paired variables
signtest ipc7_match= ipc7_twin //test median different for paired variables

tabstat esim_twin esim_match, stat(mean median) column(stat)
ttest esim_match= esim_twin //test mean different for paired variables
signtest esim_match= esim_twin //test mean different for paired variables
log close






/**********************construction of "valid_twin.dta" from raw data**********************
to construct the data, one need the following data in the current directory
1) US-EP twin sample:
us_ep_twin.dta
2) all EP filings from PATSTAT:
tls201_ep.dta extracted from tls201 tables from PATSTAT after restricting to applications filed at EPO
3) primary IPC 4-digit codes and assignments of IPC 7-digit codes:
tls209ipcmain.dta: primary IPC 4-digit codes, one application per observation
tls209ipc7.dta: assignments of IPC 7-digit codes. As one application could have multiple IPC codes, the data is unique at application-IPC7 level. 
4) word embedding vectors from Google (via BigQuery)
embedding_sample.dta: word embedding vectors for patents in us_ep_twin.dta, one patent per observation
pat_embedding_sample.dta: link EP patents to embedding data (appln_id and publication_number)
embedding_ep.dta: word embedding vectors for EP patents (ID=publication_number) 


use "$DATA\tls201_ep",clear
keep appln_id appln_filing_date grant 
joinby appln_id using "$DATA\tls209ipcmain", unmatched(master)
tab _merge 
/*
          only in master data |      4,287        0.12        0.12
both in master and using data |  3,572,276       99.88      100.00
*/
keep if _merge==3
drop _merge 
gen app_qofd=qofd(appln_filing_date)
format app_qofd %tq 
drop appln_filing_date
tempfile epsample 
save `epsample', replace 

//to be matched sample
rename appln_id appln_id2 
tempfile epsample2 
save `epsample2', replace 

use "$DATA\us_ep_twin",clear
keep if uspto==0
keep appln_id 
duplicates drop
joinby appln_id using `epsample', unmatched(master)
tab _merge //all matched
keep if _merge==3
drop _merge 

count 
joinby granted ipcmain app_qofd using `epsample2', unmatched(master)
tab _merge //all matched, 57,500,759 obs 
drop _merge
*select a random one
set seed 123456 
gen u=uniform()
sort appln_id u 
by appln_id: keep if _n<=10 
//take the first 10 to make sure at least one match could be found later 
keep appln_id* u
saveold "$DATA\ep_match" , replace version(12)



******************esim similiarity between US-EP******************
use "$DATA\us_ep_twin",clear
keep  appln_id
duplicates drop 
joinby appln_id using  "$DATA\embedding_sample", unmatched(master)
tab _merge 
/*
          only in master data |         77        0.01        0.01
both in master and using data |    641,851       99.99      100.00
*/
keep if _merge==3
drop _merge 
tempfile twin_embedding 
save `twin_embedding', replace 


use "$DATA\us_ep_twin",clear
keep if uspto==1 //US
keep appln_id_us 
duplicates drop
rename appln_id_us appln_id
joinby appln_id using `twin_embedding'
rename appln_id appln_id_us 
rename embed* a_* 
tempfile twin_a 
save `twin_a', replace 

use "$DATA\us_ep_twin",clear
keep if uspto==0 //EP
keep appln_id_us appln_id
joinby appln_id using `twin_embedding'
rename embed* b_*
joinby appln_id_us using `twin_a', unmatched(master)
tab _merge 
keep if _merge==3
drop _merge 

forvalues i=1/64{
gen pro`i'=a_`i'*b_`i'
drop a_`i' b_`i'
}

egen esim_twin=rowtotal(pro1-pro64)
drop pro1-pro64 
keep appln_id* esim_twin
label var appln_id_us "US branch"
label var appln_id "EP equivalent"
saveold "$DATA\esim_twin", replace version(12) //uni at appln_id_us appln_id 


******************esim similiarity between US-EP_match******************
use "$DATA\us_ep_twin",clear
keep  appln_id
duplicates drop 
joinby appln_id using  "$DATA\embedding_sample", unmatched(master)
tab _merge 
/*
          only in master data |         77        0.01        0.01
both in master and using data |    641,851       99.99      100.00
*/
keep if _merge==3
drop _merge 
tempfile twin_embedding 
save `twin_embedding', replace 


use "$DATA\us_ep_twin",clear
keep if uspto==1 //US
keep appln_id_us 
duplicates drop
rename appln_id_us appln_id
joinby appln_id using `twin_embedding'
rename appln_id appln_id_us 
rename embed* a_* 
tempfile twin_a 
save `twin_a', replace 

use "$DATA\us_ep_twin",clear
keep if uspto==0 //EP
keep appln_id_us appln_id
*get EP's match 
joinby appln_id using "$DATA\ep_match", unmatched(master)
tab _merge 
keep if _merge==3
drop _merge 
rename appln_id appln_id_oep 
label var appln_id_oep "original EP appln_id"
label var appln_id2 "matched EP appln_id"
rename appln_id2 appln_id
joinby appln_id using "$DATA\pat_embedding_sample", unmatched(master)
tab _merge 
keep if _merge==3
drop _merge 
sort appln_id_us appln_id_oep u
by appln_id_us appln_id_oep: keep if _n==1 //one-to-one match
count 
*get embedings for EP-match 
rename pat_num publication_number 
joinby publication_number using "$DATA\embedding_ep", unmatched(master)
tab _merge //all matched 
rename embed* b_* 
rename appln_id appln_id_match
rename appln_id_oep appln_id
*get embedings for US patents 
keep appln_id* b_* 
joinby appln_id_us using `twin_a', unmatched(master)
tab _merge 
keep if _merge==3
drop _merge 

forvalues i=1/64{
gen pro`i'=a_`i'*b_`i'
drop a_`i' b_`i'
}

egen esim_match=rowtotal(pro1-pro64)
drop pro1-pro64 
keep appln_id* esim_match
joinby appln_id_us appln_id using "$DATA\esim_twin", unmatched(both)
tab _merge 
keep if _merge==3
drop _merge 
saveold "$DATA\esim_twin", replace version(12)



******************IPC7 similiarity between US-EP******************
use "$DATA\tls209ipc7",clear //IPC7 assignment percentage and norm
keep appln_id norm 
duplicates drop 
rename appln_id appln_id_us 
rename norm a_norm
tempfile a_norm
save `a_norm', replace 


use "$DATA\us_ep_twin",clear
keep if uspto==1 //US
keep appln_id_us 
duplicates drop
rename appln_id_us appln_id
joinby appln_id using "$DATA\tls209ipc7", unmatched(master)
tab _merge 
keep if _merge==3
drop _merge
rename appln_id appln_id_us 
rename ipc7pct a_ipc7pct
drop norm
tempfile twin_a 
save `twin_a', replace 

use "$DATA\us_ep_twin",clear
keep if uspto==0 //EP
keep appln_id_us appln_id
joinby appln_id using  "$DATA\tls209ipc7", unmatched(master)
tab _merge 
keep if _merge==3
drop _merge
rename ipc7pct b_ipc7pct 
rename norm b_norm
*get a_norm (need to separate from US patent's ipc share bcs every patent has a norm, but not every patent has IPC7 matched)
joinby appln_id_us using `a_norm', unmatched(master)
tab _merge //8 unmatched
drop _merge 
*get IPC7 shares 
joinby appln_id_us ipc7 using `twin_a', unmatched(master)
tab _merge 
gen prod=a_ipc7pct*b_ipc7pct
bys appln_id_us appln_id: egen ipc7_twin=total(prod)
replace ipc7_twin=ipc7_twin/sqrt(a_norm)/sqrt(b_norm)
keep appln_id* ipc7_twin 
duplicates drop
label var appln_id_us "US branch"
label var appln_id "EP equivalent"
saveold "$DATA\ipc7_twin", replace version(12) //uni at appln_id_us appln_id 



******************ipc7 similiarity between US-EP_match******************
use "$DATA\tls209ipc7",clear
keep appln_id norm 
duplicates drop 
rename appln_id appln_id_us 
rename norm a_norm
tempfile a_norm
save `a_norm', replace 

use "$DATA\us_ep_twin",clear
keep if uspto==1 //US
keep appln_id_us 
duplicates drop
rename appln_id_us appln_id
joinby appln_id using "$DATA\tls209ipc7", unmatched(master)
tab _merge 
keep if _merge==3
drop _merge
rename appln_id appln_id_us 
rename ipc7pct* a_ipc7pct* 
tempfile twin_a 
save `twin_a', replace 

use "$DATA\esim_twin",clear
drop esim*
rename appln_id  appln_id_oep 
rename appln_id_match appln_id 
*get match's IPC assignment
joinby appln_id using "$DATA\tls209ipc7", unmatched(master)
tab _merge //all matched
keep if _merge==3
drop _merge 
gen temp=ipc7pct^2
bys appln_id*: egen b_norm=total(temp)
drop temp
rename ipc7pct b_ipc7pct
rename appln_id appln_id_match
rename appln_id_oep appln_id
*get a_norm (need to separate from US patent's ipc share bcs every patent has a norm, but not every patent has IPC7 matched)
joinby appln_id_us using `a_norm', unmatched(master)
tab _merge 
drop _merge
*get IPC7 share
 joinby appln_id_us ipc7 using `twin_a', unmatched(master)
tab _merge 
gen prod=a_ipc7pct*b_ipc7pct
bys appln_id_us appln_id: egen ipc7_match=total(prod)
replace ipc7_match=ipc7_match/sqrt(a_norm)/sqrt(b_norm)
keep appln_id* ipc7_match 
duplicates drop
label var appln_id_us "US branch"
label var appln_id "EP equivalent"

joinby appln_id_us appln_id using "$DATA\ipc7_twin", unmatched(both)
tab _merge 
keep if _merge==3
drop _merge 
saveold "$DATA\ipc7_twin", replace version(12)

use "$DATA\ipc7_twin",clear
joinby appln_id_us appln_id appln_id_match using "$DATA\esim_twin", unmatched(both)
drop _merge 
saveold "$DATA\valid_twin", replace version(12)
rm "$DATA\ipc7_twin.dta"
rm "$DATA\esim_twin.dta"
*/
